# topic 16 # This will be remarkably similar to topic 15, so # much so that it would be worth comparing the two scripts. # First, set up the situation. We have a population # with an unknown standard deviation. # source("../gnrnd5.R") source("../gnrnd4.R") gnrnd5( 179453199903, 873001418) big_pop <- L1 # I do not know the standard deviation of big_pop # Now, someone says that they believe that the # mean of big_pop is 200. Thus our null hypothesis # is that the true mean is 200. We want to see if that # could be correct. We will get a sample of # size 43 and we will look at the sample mean. # We are willing to be wrong in telling the # person that they are wrong 2.5% of the time! ######################## ## The critical value approach. We know that ## where the standard deviation of big_pop is sigma then ## samples of size 43 will have a standard error ## of the mean be sigma/sqrt(43) and that those ## values will be normally distributed with the ## same mean as the population, a value we assume ## to be 200. But we do not know the value of sigma. ## So we can approximate the distribution of the ## mean of samples of size 43 with a Student's-t ## with 42 degrees of freedom. ## This is a two-tailed test because ## a sample mean that is either too low or too high ## would indicate that 200 is not the mean. ## Therefore, find the value, in a Student's-t with ## 42 degrees for freedom that has 0.025/2 as the ## P(Xt), where we use the sample ## standard deviation rather than the population ## standard deviation. Because of this we need to ## take the sample first in order to find the sample ## standard deviation. # Get our sample # the first time we do this let us get the # same sample each time gnrnd4(768734201, 200000001) L1 # take those as the index values of our random sample our_samp <- big_pop[ L1 ] our_samp # find the mean of our sample x_bar <- mean( our_samp ) x_bar # find the standard deviation of the sample s_x <- sd( our_samp ) s_x # Now we can get our critical values # long way low_t <- qt(0.025/2,42) low_t low_val <- 200 + low_t*s_x/sqrt(43) # + because t is < 0 low_val high_t <- qt( 0.025/2,42, lower.tail=FALSE) high_t # this was silly because we know it is -low_t high_val <- 200 + high_t*s_x/sqrt(43) high_val ## so now compare the mean of the sample ## to our critical values. x_bar ## In this case the sample mean is greater than ## our critical high. Therefore reject, at the ## 0.025 level of significance, the ## null hypothesis that the true mean is 200 ## in favor of the alternative hypothesis that ## the true mean is not 200. ############# ############# the attained significance approach ## how strange would it be to get a mean of ## 207.0581 for a sample of size 43 if the true ## mean is 200? Because we do not have the ## flexibility in pt() that we have in pnorm() ## we will have to normalize our value. # pt( (207.0581 - 200 )/(s_x/sqrt(43)), 42, lower.tail=FALSE) # ## but we would need to double that to account for ## values that extreme or more extreme on the low ## side 0.005047642*2 # is that probability less than our 2.5% ? # Yes, therefore, reject the null hypothesis # in favor of the alternative. ####################### Now use the function to ####################### do the same thing source("../hypo_unknown.R") hypoth_test_unknown( 200,0, 0.025, 43, x_bar, s_x ) ####################################### ####################################### # Now we want to repeat this process # but each time we want a different sample # of size 35 L1 <- sample( big_pop, 43 ) x_bar <- mean( L1 ) s_x <- sd( L1 ) hypoth_test_unknown( 200, 0, 0.025, 43, x_bar, s_x ) #### perform lines 114-118 again and again ### now, since we have the population let us peek # at the true mean mean( big_pop ) ####### Try our samples again, but this time test ## the null hypothesis that the true mean ## is 207.0618, and do the test at the 0.05 ## level of significance. L1 <- sample( big_pop, 43 ) x_bar <- mean( L1 ) s_x <- sd( L1 ) hypoth_test_unknown( 207.0618, 0, 0.05, 43, x_bar, s_x ) #### perform lines 128-132 again and again, #### and we should see a Type I error about #### 5% of the time. ### we can actually do this 1000 times and see how ### times we reject the null hypothesis even ### though it is true. L2 <- 1:1000 for( i in 1:1000) { L1 <- sample( big_pop, 43 ) x_bar <- mean( L1 ) s_x <- sd( L1 ) answer <- hypoth_test_unknown( 207.0618, 0, 0.05, 43, x_bar, s_x ) L2[i] <- answer[14] } table( L2 )